"""U.S. ESG FEATURES SAMPLE"""
'''SAMPLE SMALLER AS ESG DATA BEGINS END OF 2001
ESG SAMPLE -
26,062 observations
768 firms
14,592 from developed nations - 17 countries
6,119 from developing nations - 19 countries
NON-ESG SAMPLE
85,553 observations
1858 firms'''
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('bmh')
os.chdir('/Users/lockiemichalski/Documents/Files_Mac_before_clean/Desktop/UQ/RQ3_2020/Credit_rating_WRDS/Steps_sample/Files to upload rand/US_data/Draft2_data')
data = pd.read_csv('US_SAMPLE_ESG_ONLY.csv').iloc[:,1:]
data.sort_values(by='rating_rank', inplace=True)
print(str(len(data)) + ' observations')
print(str(len(data['Instrument'].unique())) + ' unique firms')
"WITHOUT FINANCIALS"
plt.figure(figsize = (20, 10), dpi = 200)
sns.countplot(x='Issuer Rating', data=data).set_title('All-classes')
# class C is not in graph as there are only 2 observations and for classifiers to run this has to removed.
"Function to plot the missing values"
def missing_value_plot(df,title):
null_counts = df.isnull().sum()/len(df)
null_counts = null_counts.sort_values()
plt.figure(figsize=(16,8))
plt.xticks(np.arange(len(null_counts))+0.5,null_counts.index,rotation='vertical')
plt.ylabel('fraction of rows with missing data (%)')
plt.bar(np.arange(len(null_counts)),null_counts)
plt.title(title)
plt.xlabel('Features')
financial_features = ['accrualq', 'aftret_eqq', 'aftret_equityq', 'aftret_invcapxq', 'at_turnq',
'bmq', 'capeiq', 'capital_ratioq', 'cash_conversionq', 'cash_debtq', 'cash_ltq',
'cash_ratioq', 'cfmq', 'curr_debtq', 'curr_ratioq', 'debt_atq', 'de_ratioq',
'debt_capitalq', 'debt_ebitdaq', 'debt_invcapq', 'divyieldq', 'dltt_beq',
'dprq', 'efftaxq', 'equity_invcapq', 'evmq', 'fcf_ocfq', 'gpmq', 'GProfq',
'int_debtq', 'int_totdebtq', 'intcov_ratioq', 'intcovq', 'inv_turnq',
'invt_actq', 'lt_atq', 'lt_debtq', 'lt_ppentq', 'mktcap', 'npmq', 'ocf_lctq',
'opmadq', 'opmbdq', 'pay_turnq', 'pcfq', 'pe_exiq', 'pe_incq', 'pe_op_basicq',
'pe_op_dilq', 'PEG_1yrforward', 'PEG_ltgforward', 'PEG_trailing', 'pretret_earnatq',
'pretret_noq', 'price_adj', 'price_unadj', 'profit_lctq', 'psq', 'ptbq', 'ptpmq',
'quick_ratioq', 'rd_saleq', 'rect_actq', 'rect_turnq', 'roaq', 'roceq', 'roeq',
'sale_equityq', 'sale_invcapq', 'sale_nwcq', 'short_debtq', 'totdebt_invcapq',
'gind', 'ggroup', 'gsector']
missing_value_plot(data[financial_features], 'Financial ratios - without gsector 40') # financial ratios missing data + 3 categorical industry
"""Financial ratios: divyield should be filled with 0 as these would be firms with no dividend,
therefore median not correct to use. This is probable for some of the debt ratios too as some firms might not actually
have any debt leading to such high missing values. rd_saleq needs to go. Similarly, PE ratio types exhibit
lots of missing data"""
categorical_ind = ['gind', 'ggroup', 'gsector']
missing_value_plot(data[['Rec_Total','Rec_Median','Rec_Low','Rec_High','Rec_SBuy','Rec_Buy','Rec_Hold',
'Rec_Sell','Rec_SSell','Rec_NoOpinion','LTG_Mean','Num_Analyst']], 'Analyst recommendations')
'''NOTE: Once a firm has a recommendation, all the columns have values. This missing data is prior to the
firm having any recommendations. What should we do about this?'''
categorical_inst = ['Top10InstOwn','NumInstBlockOwners','InstBlockOwn','NumInstOwners',
'MaxInstOwn','InstOwn','InstOwn_HHI','InstOwn_Perc']
missing_value_plot(data[categorical_inst], "Inst holdings")
macro_features = ['balance_on_trade_bop', 'business_confidence', 'cab%gdp', 'central_government_deficit',
'consumer_confidence', 'consumer_spending', 'current_account_balance',
'export_goods_bop', 'export_prices', 'exports_goods_services', 'foreign_trade_balance',
'government_bond', 'government_external_debt', 'government_spending',
'gross_fixed_capital_investment', 'import_prices', 'imports_goods_bop',
'imports_goods_services', 'industrial_production', 'international_reserves',
'labour_force_survey', 'merchandise_exports', 'merchandise_imports', 'money_supply_M0',
'money_supply_M1', 'money_supply_M2', 'overnight_rate', 'population',
'retail_sales', 'sov_outlook_rank','stock_index', 'terms_of_trade',
'unemployment_rate']
missing_value_plot(data[macro_features], 'Macro + Macro growth')
#non-missing df excluding ESG features with missing data - refer to sum stats GLOBAL csv file
''' 420 ESG features '''
'''Referred excel file Global sum stats '''
ESG_sum_stats = pd.read_csv('US_ESG_sum_stats.csv')
'''217 features have <5% missing values and 195 < 2% missing values'''
#Split it up to four plots
ESG_217_cols = list(ESG_sum_stats.iloc[:,1:218].columns)
list_ESG = np.array_split(ESG_217_cols, 4)
missing_value_plot(data[list(list_ESG[0])], "ESG first features")
missing_value_plot(data[list(list_ESG[1])], "ESG second features")
missing_value_plot(data[list(list_ESG[2])], "ESG third features")
missing_value_plot(data[list(list_ESG[3])], "ESG fourth features")
missing_value_plot(data[list(ESG_sum_stats.iloc[:,219:300].columns)], "ESG part of remaining")
categorical_inst = ['Top10InstOwn','NumInstBlockOwners','InstBlockOwn','NumInstOwners',
'MaxInstOwn','InstOwn','InstOwn_HHI','InstOwn_Perc']
categorical_macro = ['sov_rating_rank','sov_outlook_rank']
categorical_analyst = ['Rec_Total','Rec_Median','Rec_Low','Rec_High','Rec_SBuy','Rec_Buy','Rec_Hold',
'Rec_Sell','Rec_SSell','Rec_NoOpinion','LTG_Mean','Num_Analyst']
categorical_ind = ['gind', 'ggroup', 'gsector']
categorical = categorical_ind + categorical_inst + categorical_analyst + categorical_macro
'''Box plot of inst categorical features'''
fig, ax = plt.subplots(3, 3, figsize=(30, 30))
for var, subplot in zip(categorical_inst, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of analyst categorical features'''
fig, ax = plt.subplots(3, 3, figsize=(30, 30))
for var, subplot in zip(categorical_analyst, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of industry categorical features'''
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
for var, subplot in zip(categorical_ind, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Capitalization'''
fin_capitalization=['capital_ratioq','equity_invcapq','debt_invcapq','totdebt_invcapq']
fig, ax = plt.subplots(2,2, figsize=(20, 10))
for var, subplot in zip(fin_capitalization, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Efficiency'''
fin_efficiency=['at_turnq','inv_turnq','pay_turnq','rect_turnq','sale_equityq','sale_invcapq','sale_nwcq']
fig, ax = plt.subplots(4,2, figsize=(30, 30))
for var, subplot in zip(fin_efficiency, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Financial Soundness'''
fin_finsoundness=['invt_actq','rect_actq','fcf_ocfq','ocf_lctq','cash_debtq',
'cash_ltq','cfmq','short_debtq','profit_lctq',
'curr_debtq','debt_ebitdaq','dltt_beq','int_debtq',
'int_totdebtq','lt_debtq','lt_ppentq']
fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_finsoundness, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Liquidity'''
fin_liquidity=['cash_ratioq','curr_ratioq','quick_ratioq','accrualq','rd_saleq']
fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_liquidity, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Profitability''' #NO FINANCIALS
fin_profitability = ['efftaxq','GProfq','aftret_eqq','aftret_equityq','aftret_invcapxq','gpmq','npmq','opmadq',
'opmbdq','pretret_earnatq','pretret_noq','ptpmq','roaq','roceq','roeq']
fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_profitability, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Profitability'''
fin_profitability = ['efftaxq','GProfq','aftret_eqq','aftret_equityq','aftret_invcapxq','gpmq','npmq','opmadq',
'opmbdq','pretret_earnatq','pretret_noq','ptpmq','roaq','roceq','roeq']
fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_profitability, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Solvency'''
fin_solvency = ['de_ratioq','debt_atq','lt_atq','debt_capitalq','intcovq','intcov_ratioq']
fig, ax = plt.subplots(3,2, figsize=(30, 30))
for var, subplot in zip(fin_solvency, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - Valuation'''
fin_valuation = ['dprq','PEG_1yrforward','PEG_ltgforward','PEG_trailing','bmq','capeiq','divyieldq',
'evmq','pcfq','pe_exiq','pe_incq','pe_op_basicq','pe_op_dilq','psq','ptbq']
fig, ax = plt.subplots(8,2, figsize=(30, 30))
for var, subplot in zip(fin_valuation, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of financial ratio features - other'''
fin = ['mktcap','price_adj']
fig, ax = plt.subplots(1,3, figsize=(30, 10))
for var, subplot in zip(fin, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Macro and macro_growth features'''
macro_features_growth = [col for col in data if col.endswith('%')]
'''Box plot of numeric macro features'''
fig, ax = plt.subplots(9, 4, figsize=(40, 30))
for var, subplot in zip(macro_features, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
'''Box plot of numeric macro growth features'''
fig, ax = plt.subplots(9, 4, figsize=(40, 30))
for var, subplot in zip(macro_features_growth, ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
"""ESG first 67 features"""
fig, ax = plt.subplots(14, 4, figsize=(40, 50))
for var, subplot in zip(list(list_ESG[0]), ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
"""ESG second 50 features"""
fig, ax = plt.subplots(14, 4, figsize=(40, 50))
for var, subplot in zip(list(list_ESG[1]), ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
fig, ax = plt.subplots(14, 4, figsize=(40, 50))
for var, subplot in zip(list(list_ESG[2]), ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
fig, ax = plt.subplots(14, 4, figsize=(40, 50))
for var, subplot in zip(list(list_ESG[2]), ax.flatten()):
sns.boxplot(x='Issuer Rating',y=var,data=data, ax=subplot)
# get the distribution of a feature per class with Yeo-Johnson transformation
from scipy.stats import anderson
from scipy.stats import kstest
#from scipy.stats import zscore
from scipy.stats import yeojohnson
import scipy.stats as st
'''Interested in each distribution by each class'''
def get_best_distribution_groupby(data,rating):
dist_names = ["norm", "lognorm","exponweib", "weibull_max",
"weibull_min", "pareto", "genextreme","logistic"]
dist_results = []
params = {}
for dist_name in dist_names:
dist = getattr(st, dist_name)
param = dist.fit(data)
params[dist_name] = param
# Applying the Kolmogorov-Smirnov test
D, p = kstest(data, dist_name, args=param)
dist_results.append((dist_name, p))
# select the best fitted distribution
best_dist, best_p = (max(dist_results, key=lambda item: item[1]))
# store the name of the best fit and its p value
print("Best fitting distribution " +str(rating)+": " +str(best_dist))
return best_dist
def dist_by_class(class_col, feature_col, df):
df = df[[class_col,feature_col]]
unique_ir = df[class_col].unique()
#rating_zscores = []
rating_yeojohnson = []
rating_dist = []
for rating in unique_ir:
data = df[df[class_col]==rating]
#z = zscore(data[feature_col].dropna())
xt, lmbda = yeojohnson(np.array(data[feature_col].dropna()))
if len(xt) == 0:
print('No obs')
else:
#rating_zscores.append(xt)
rating_yeojohnson.append(xt)
print(str(len(xt)) + " length without outliers")
"""For anderson:
dist must be 'norm', 'expon', 'gumbel', 'extreme1' or 'logistic'."""
and_darl = anderson(xt, 'norm') #and_darl
#print("and_darl_test: " + str(and_darl))
best_dist = get_best_distribution_groupby(xt,rating) #kstest
rating_dist.append(best_dist)
fig, ax = plt.subplots(5,5,
figsize=(30,30))
for rating,rating_dist,unique_rating,subplot in zip(rating_yeojohnson,rating_dist,
list(unique_ir), ax.flatten()):
sns.distplot(rating, ax=subplot).set_title(str(unique_rating)+' '+str(rating_dist))
dist_by_class('Issuer Rating', 'ESG Combined Score', data)